# Read the two CSV inputs from data/processed/childes, resolving each path
# with here(). Each path is defined right next to the read that uses it.
mappings_in_path <- here("data/processed/childes/all_types_norm_mappings.csv")
mappings <- read_csv(mappings_in_path)

tokens_in_path <- here("data/processed/childes/all_tokens_post-norm.csv")
tokens_raw <- read_csv(tokens_in_path)

Grouping by corpus AND child

# Tag every token row with a single speaker key, "<corpus>_<child>".
# paste() is vectorised over the whole column, so no grouping is required
# to build the key.
by_kid1 <- tokens_raw %>%
  mutate(corpus_child = paste(corpus, child, sep = "_"))

# One row per (corpus_child, word) pair, with n = number of token rows for
# that pair. add_count() followed by distinct() keeps the rows in
# first-appearance order.
by_kid2 <- by_kid1 %>%
  dplyr::select(corpus_child, word) %>%
  add_count(corpus_child, word) %>%
  distinct(corpus_child, word, .keep_all = TRUE)

# N         = total token count for the child (sum of n within corpus_child)
# freq      = share of the child's tokens accounted for by the word
# trns_freq = log10(freq + 1)
# N is created under its final name directly, instead of tallying into an
# automatically chosen column name ("nn") and renaming it afterwards.
by_kid3 <- by_kid2 %>%
  group_by(corpus_child) %>%
  mutate(N = sum(n)) %>%
  ungroup() %>%
  mutate(freq = n / N,
         trns_freq = log10(freq + 1))
# Wide table: one row per word, one column per corpus_child, cells hold
# trns_freq; word/child combinations that never occur are filled with 0.
td_matrix_bykid <- spread(
  dplyr::select(by_kid3, word, corpus_child, trns_freq),
  key = corpus_child,
  value = trns_freq,
  fill = 0
)

# Pairwise correlations between the per-child columns (word column removed).
M_bykid <- cor(dplyr::select(td_matrix_bykid, -word))

# Display only the first 10 x 10 corner of the correlation matrix, rounded to
# two decimals and drawn as numbers.
M_bykid[1:10, 1:10] %>%
  round(2) %>%
  corrplot(method = "number", tl.srt = 45)

# Non-metric MDS in two dimensions, using 1 - correlation as dissimilarity.
nm_mds_bykid <- isoMDS(1 - M_bykid, k = 2)
## initial  value 29.894866 
## iter   5 value 19.726697
## iter  10 value 16.273466
## iter  15 value 15.962877
## iter  20 value 15.806430
## iter  20 value 15.790962
## iter  20 value 15.779068
## final  value 15.779068 
## converged
# Turn the 2-D MDS configuration into a data frame with one row per
# corpus_child (taken from the row names) and coordinates named x / y.
coords_bykid <- nm_mds_bykid$points %>%
  as.data.frame() %>%
  rename(x = V1, y = V2) %>%
  rownames_to_column(var = "corpus_child") %>%
  # Split the key back into corpus and child at the first "_" only.
  # extra = "merge" keeps any further underscores inside `child` instead of
  # discarding the trailing pieces with a warning.
  separate(corpus_child, c("corpus", "child"), sep = "_", remove = FALSE,
           extra = "merge") %>%
  # Abbreviate the two long corpus names; every other name is left unchanged.
  mutate(corpus = ifelse(corpus == "MacWhinney", "McW",
                         ifelse(corpus == "EllisWeismer", "EW",
                                corpus)))
# Scatter of the MDS coordinates with each point drawn as its corpus label,
# coloured by corpus; the view is restricted to a fixed window.
ggplot(coords_bykid, aes(x, y, label = corpus, color = corpus)) +
  geom_text() +
  theme_minimal() +
  # Hide the colour legend. "none" is used because passing FALSE to guides()
  # is deprecated in ggplot2.
  guides(color = "none") +
  coord_cartesian(xlim = c(-2, 1.5), ylim = c(-3.5, 1))

# Rectangular 2-D bin counts (0.2 x 0.2 bins) inside the fixed viewing window.
ggplot(data = coords_bykid, mapping = aes(x = x, y = y)) +
  geom_bin2d(binwidth = c(0.2, 0.2)) +
  scale_fill_continuous(low = "lavender", high = "darkslategray4") +
  coord_cartesian(xlim = c(-2, 1.5), ylim = c(-3.5, 1)) +
  labs(title = '"Heatmap of 2d bin counts"', caption = "binwidth = 0.2 x 0.2") +
  theme_minimal()

# Contour lines of a 2-D density estimate; axes are not restricted here.
ggplot(data = coords_bykid, mapping = aes(x = x, y = y)) +
  geom_density_2d() +
  labs(title = '"Contours of a 2d density estimate"',
       caption = "NOTE: change in scale!") +
  theme_minimal()

# Same density estimate drawn as filled polygons, fill mapped to the contour
# level; default theme and unrestricted axes.
ggplot(data = coords_bykid, mapping = aes(x = x, y = y)) +
  stat_density_2d(
    mapping = aes(fill = ..level..),
    geom = "polygon",
    colour = "white"
  ) +
  labs(title = '"Contours of a 2d density estimate" with color',
       caption = "NOTE: change in scale!")

# Hexagonal bin counts inside the fixed viewing window.
ggplot(data = coords_bykid, mapping = aes(x = x, y = y)) +
  geom_hex() +
  scale_fill_continuous(low = "lavender", high = "darkslategray4") +
  coord_cartesian(xlim = c(-2, 1.5), ylim = c(-3.5, 1)) +
  labs(title = '"Hexagonal heatmap of 2d bin counts"') +
  theme_minimal()

Changing k=2 to k=3

# Non-metric MDS on the same 1 - correlation dissimilarities, now with three
# dimensions instead of two.
nm_mds_bykid2 <- isoMDS(1 - M_bykid, k = 3)
## initial  value 22.761708 
## iter   5 value 13.329459
## iter  10 value 12.238055
## iter  15 value 11.936425
## iter  20 value 11.789480
## final  value 11.729094 
## converged
# Turn the 3-D MDS configuration into a data frame with one row per
# corpus_child (taken from the row names). Only the first two dimensions are
# renamed (x / y); the third keeps its default name V3.
coords_bykid2 <- nm_mds_bykid2$points %>%
  as.data.frame() %>%
  rename(x = V1, y = V2) %>%
  rownames_to_column(var = "corpus_child") %>%
  # Split the key back into corpus and child at the first "_" only.
  # extra = "merge" keeps any further underscores inside `child` instead of
  # discarding the trailing pieces with a warning.
  separate(corpus_child, c("corpus", "child"), sep = "_", remove = FALSE,
           extra = "merge") %>%
  # Abbreviate the two long corpus names; every other name is left unchanged.
  mutate(corpus = ifelse(corpus == "MacWhinney", "McW",
                         ifelse(corpus == "EllisWeismer", "EW",
                                corpus)))
# Scatter of the first two dimensions of the k = 3 solution, each point drawn
# as its corpus label and coloured by corpus; fixed viewing window.
ggplot(coords_bykid2, aes(x, y, label = corpus, color = corpus)) +
  geom_text() +
  theme_minimal() +
  # Hide the colour legend. "none" is used because passing FALSE to guides()
  # is deprecated in ggplot2.
  guides(color = "none") +
  coord_cartesian(xlim = c(-2, 1.5), ylim = c(-3.5, 1)) +
  labs(title = "k=3")

# Rectangular 2-D bin counts (0.2 x 0.2 bins) for the k = 3 coordinates,
# inside the fixed viewing window.
ggplot(data = coords_bykid2, mapping = aes(x = x, y = y)) +
  geom_bin2d(binwidth = c(0.2, 0.2)) +
  scale_fill_continuous(low = "lavender", high = "darkslategray4") +
  coord_cartesian(xlim = c(-2, 1.5), ylim = c(-3.5, 1)) +
  labs(title = '"Heatmap of 2d bin counts"', caption = "binwidth = 0.2 x 0.2",
       subtitle = "k=3") +
  theme_minimal()

# Contour lines of a 2-D density estimate; axes are not restricted here.
ggplot(data = coords_bykid2, mapping = aes(x = x, y = y)) +
  geom_density_2d() +
  labs(title = '"Contours of a 2d density estimate"',
       caption = "NOTE: change in scale!",
       subtitle = "k=3") +
  theme_minimal()

# Same density estimate drawn as filled polygons, fill mapped to the contour
# level; default theme and unrestricted axes.
ggplot(data = coords_bykid2, mapping = aes(x = x, y = y)) +
  stat_density_2d(
    mapping = aes(fill = ..level..),
    geom = "polygon",
    colour = "white"
  ) +
  labs(title = '"Contours of a 2d density estimate" with color',
       caption = "NOTE: change in scale!",
       subtitle = "k=3")

# Hexagonal bin counts inside the fixed viewing window.
ggplot(data = coords_bykid2, mapping = aes(x = x, y = y)) +
  geom_hex() +
  scale_fill_continuous(low = "lavender", high = "darkslategray4") +
  coord_cartesian(xlim = c(-2, 1.5), ylim = c(-3.5, 1)) +
  labs(title = '"Hexagonal heatmap of 2d bin counts"',
       subtitle = "k=3") +
  theme_minimal()

Plotting 3 variables

Resources: